library(NbClust)
library(factoextra)
library(ClusterR)
library(ggplot2)
library(scatterplot3d)
library(dplyr)
library(stringr)
library(VIM)
library(fpc)
library(plotly)
library(clValid)

Loading Datasets

# Load the eight synthetic clustering datasets and the world-indicators
# dataset. All CSV files are expected in the current working directory.
data1 <- read.csv('Data1.csv')
data2 <- read.csv('Data2.csv')
data3 <- read.csv('Data3.csv')
data4 <- read.csv('Data4.csv')
data5 <- read.csv('Data5.csv')
data6 <- read.csv('Data6.csv')
data7 <- read.csv('Data7.csv')
data8 <- read.csv('Data8.csv')
world_data <- read.csv('World Indicators.csv')

Dataset 1

We check K value recommendation from elbow method and silhouette method

# Candidate number of clusters for Dataset 1: elbow (total within-cluster
# sum of squares) and average-silhouette methods from factoextra.
set.seed(10)
fviz_nbclust(x = data1[, 2:4], FUNcluster = kmeans, method = "wss")

fviz_nbclust(x = data1[, 2:4], FUNcluster = kmeans, method = "silhouette")

K suggested by elbow and silhouette method = 6

#Setting k = 6 from silhouette recommendation
k <- 6
#Checking CH value for k values around the suggested k-value
# (Calinski-Harabasz index via fpc::calinhara; larger is better)
print("K CH Value")
## [1] "K CH Value"
for(i in (k-2):(k+2)){
  # k-means with 20 random restarts for each candidate k
  km <- kmeans(data1[, 2:4], i, nstart = 20)
  ch <- round(calinhara(data1[, 2:4],km$cluster),digits=2)
  print(paste(i,ch))
}
## [1] "4 98.68"
## [1] "5 146.82"
## [1] "6 262.71"
## [1] "7 225.1"
## [1] "8 478.81"

Among the values scanned, CH actually peaks at k = 8, and CH for k = 7 (225.1) is lower than for k = 6 (262.71); we nevertheless proceed with k = 7, which yields perfect external validation below

# Final k-means fit for Dataset 1 with k = 7 (see discussion above);
# store the cluster assignments on the data frame.
km <- kmeans(data1[, 2:4], 7, nstart = 20)
data1$kmeans_cluster <- km$cluster
#Display external_validation stats against the ground-truth Class labels
# (FIX: use TRUE rather than T, which is a reassignable variable)
external_validation(true_labels = data1$Class, clusters = data1$kmeans_cluster, method = 'jaccard_index', summary_stats = TRUE)
##  
## ---------------------------------------- 
## purity                         : 1 
## entropy                        : 0 
## normalized mutual information  : 1 
## variation of information       : 0 
## normalized var. of information : 0 
## ---------------------------------------- 
## specificity                    : 1 
## sensitivity                    : 1 
## precision                      : 1 
## recall                         : 1 
## F-measure                      : 1 
## ---------------------------------------- 
## accuracy OR rand-index         : 1 
## adjusted-rand-index            : 1 
## jaccard-index                  : 1 
## fowlkes-mallows-index          : 1 
## mirkin-metric                  : 0 
## ----------------------------------------
## [1] 1

Jaccard index = 1, purity = 1 for our clustering solution

# Single-linkage hierarchical clustering on Euclidean distances;
# dendrogram plotted to pick the cut height.
# NOTE(review): dist_matrix is not used again in this section — confirm
# before removing.
dist_matrix <- as.matrix(dist(data1[,2:4]))
hc.single <- hclust(dist(data1[,2:4]), method = 'single')
plot(hc.single)

From dendrogram, we can set number of clusters = 7

# Cut the single-linkage dendrogram into 7 clusters and validate against
# the true labels. (FIX: use TRUE rather than T, which is reassignable.)
data1$h_cluster <- cutree(hc.single, 7)
external_validation(true_labels = data1$Class, clusters = data1$h_cluster, method = 'jaccard_index', summary_stats = TRUE)
##  
## ---------------------------------------- 
## purity                         : 1 
## entropy                        : 0 
## normalized mutual information  : 1 
## variation of information       : 0 
## normalized var. of information : 0 
## ---------------------------------------- 
## specificity                    : 1 
## sensitivity                    : 1 
## precision                      : 1 
## recall                         : 1 
## F-measure                      : 1 
## ---------------------------------------- 
## accuracy OR rand-index         : 1 
## adjusted-rand-index            : 1 
## jaccard-index                  : 1 
## fowlkes-mallows-index          : 1 
## mirkin-metric                  : 0 
## ----------------------------------------
## [1] 1

Jaccard index = 1 and purity = 1 for our hierarchical clustering

Plotting data according to actual class, Kmeans clustering and Hierarchical clustering

# 3-D scatter plots of Dataset 1, coloured three ways: true class,
# k-means assignment, and hierarchical-clustering assignment.
axis_titles <- list(xaxis = list(title = "X1"),
                    yaxis = list(title = "X2"),
                    zaxis = list(title = "X3"))
# Actual class
fig <- plot_ly(data1, x = ~X1, y = ~X2, z = ~X3, color = ~Class) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
# K-means clusters
fig <- plot_ly(data1, x = ~X1, y = ~X2, z = ~X3, color = ~kmeans_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
# Hierarchical clusters
fig <- plot_ly(data1, x = ~X1, y = ~X2, z = ~X3, color = ~h_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig

Dataset 2

We check K value recommendation from elbow method and silhouette method

# Candidate number of clusters for Dataset 2: elbow and average-silhouette
# methods from factoextra.
set.seed(10)
fviz_nbclust(x = data2[, 2:4], FUNcluster = kmeans, method = "wss")

fviz_nbclust(x = data2[, 2:4], FUNcluster = kmeans, method = "silhouette")

K suggested by elbow and silhouette method = 4

# Setting k = 4 from the silhouette recommendation
# (the original comment said k = 6, which did not match the code)
k <- 4
#Checking CH value for k values around the suggested k-value
# (Calinski-Harabasz index via fpc::calinhara; larger is better)
print("K CH Value")
## [1] "K CH Value"
for(i in (k-2):(k+2)){
  km <- kmeans(data2[, 2:4], i, nstart = 20)
  ch <- round(calinhara(data2[, 2:4],km$cluster),digits=2)
  print(paste(i,ch))
}
## [1] "2 411.4"
## [1] "3 487.11"
## [1] "4 677.98"
## [1] "5 697.03"
## [1] "6 682.36"

CH value for k = 5 is greater than k = 4, external validation (jaccard_index) is also higher for k=5

# Final k-means fit for Dataset 2 with k = 5 (highest CH in the scan);
# store the cluster assignments on the data frame.
k <- 5
km <- kmeans(data2[, 2:4], k, nstart = 20)
data2$kmeans_cluster <- km$cluster
#Display external_validation stats against the ground-truth Class labels
# (FIX: use TRUE rather than T, which is reassignable)
external_validation(true_labels = data2$Class, clusters = data2$kmeans_cluster, method = 'jaccard_index', summary_stats = TRUE)
##  
## ---------------------------------------- 
## purity                         : 0.9876 
## entropy                        : 0.3722 
## normalized mutual information  : 0.7849 
## variation of information       : 0.8177 
## normalized var. of information : 0.354 
## ---------------------------------------- 
## specificity                    : 0.9933 
## sensitivity                    : 0.5917 
## precision                      : 0.9808 
## recall                         : 0.5917 
## F-measure                      : 0.7381 
## ---------------------------------------- 
## accuracy OR rand-index         : 0.8463 
## adjusted-rand-index            : 0.6385 
## jaccard-index                  : 0.5849 
## fowlkes-mallows-index          : 0.7618 
## mirkin-metric                  : 25030 
## ----------------------------------------
## [1] 0.5849363

Jaccard index = 0.58, purity = 0.98 for our clustering solution

# Centroid-linkage hierarchical clustering on Euclidean distances.
# NOTE(review): dist_matrix is not used again in this section — confirm
# before removing.
dist_matrix <- as.matrix(dist(data2[,2:4]))
hc.single <- hclust(dist(data2[,2:4]), method = 'centroid')
plot(hc.single)

> Using “method” as “centroid” gives better clustering results than “single”. Observing the dendrogram, we can cut it to create 4 clusters

# Cut the centroid-linkage dendrogram into 4 clusters and validate against
# the true labels. (FIX: use TRUE rather than T, which is reassignable.)
data2$h_cluster <- cutree(hc.single, 4)
external_validation(true_labels = data2$Class, clusters = data2$h_cluster, method = 'jaccard_index', summary_stats = TRUE)
##  
## ---------------------------------------- 
## purity                         : 1 
## entropy                        : 0 
## normalized mutual information  : 1 
## variation of information       : 0 
## normalized var. of information : 0 
## ---------------------------------------- 
## specificity                    : 1 
## sensitivity                    : 1 
## precision                      : 1 
## recall                         : 1 
## F-measure                      : 1 
## ---------------------------------------- 
## accuracy OR rand-index         : 1 
## adjusted-rand-index            : 1 
## jaccard-index                  : 1 
## fowlkes-mallows-index          : 1 
## mirkin-metric                  : 0 
## ----------------------------------------
## [1] 1

Jaccard index = 1 and purity = 1 for our hierarchical clustering

Plotting data according to actual class, Kmeans clustering and Hierarchical clustering

# 3-D scatter plots of Dataset 2, coloured three ways: true class,
# k-means assignment, and hierarchical-clustering assignment.
axis_titles <- list(xaxis = list(title = "X"),
                    yaxis = list(title = "Y"),
                    zaxis = list(title = "C"))
# Actual class
fig <- plot_ly(data2, x = ~X, y = ~Y, z = ~C, color = ~Class) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
# K-means clusters
fig <- plot_ly(data2, x = ~X, y = ~Y, z = ~C, color = ~kmeans_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
# Hierarchical clusters
fig <- plot_ly(data2, x = ~X, y = ~Y, z = ~C, color = ~h_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig

Dataset 3

We check K value recommendation from elbow method and silhouette method

# Candidate number of clusters for Dataset 3: elbow and average-silhouette
# methods from factoextra.
set.seed(10)
fviz_nbclust(x = data3[, 2:4], FUNcluster = kmeans, method = "wss")

fviz_nbclust(x = data3[, 2:4], FUNcluster = kmeans, method = "silhouette")

K suggested by elbow and silhouette method = 4

# Setting k = 4 from the silhouette recommendation
# (the original comment said k = 6, which did not match the code)
k <- 4
#Checking CH value for k values around the suggested k-value
# (Calinski-Harabasz index via fpc::calinhara; larger is better)
print("K CH Value")
## [1] "K CH Value"
for(i in (k-2):(k+2)){
  km <- kmeans(data3[, 2:4], i, nstart = 20)
  ch <- round(calinhara(data3[, 2:4],km$cluster),digits=2)
  print(paste(i,ch))
}
## [1] "2 137.95"
## [1] "3 204.3"
## [1] "4 418.39"
## [1] "5 344.24"
## [1] "6 303.4"

CH value for k = 4 highest

# Final k-means fit for Dataset 3 with k = 4 (highest CH in the scan);
# store the cluster assignments on the data frame.
k <- 4
km <- kmeans(data3[, 2:4], k, nstart = 20)
data3$kmeans_cluster <- km$cluster
#Display external_validation stats against the ground-truth Class labels
# (FIX: use TRUE rather than T, which is reassignable)
external_validation(true_labels = data3$Class, clusters = data3$kmeans_cluster, method = 'jaccard_index', summary_stats = TRUE)
##  
## ---------------------------------------- 
## purity                         : 1 
## entropy                        : 0 
## normalized mutual information  : 1 
## variation of information       : 0 
## normalized var. of information : 0 
## ---------------------------------------- 
## specificity                    : 1 
## sensitivity                    : 1 
## precision                      : 1 
## recall                         : 1 
## F-measure                      : 1 
## ---------------------------------------- 
## accuracy OR rand-index         : 1 
## adjusted-rand-index            : 1 
## jaccard-index                  : 1 
## fowlkes-mallows-index          : 1 
## mirkin-metric                  : 0 
## ----------------------------------------
## [1] 1

Jaccard index = 1, purity = 1 for our clustering solution. We have perfect clustering

# Centroid-linkage hierarchical clustering on Euclidean distances.
# NOTE(review): dist_matrix is not used again in this section — confirm
# before removing.
dist_matrix <- as.matrix(dist(data3[,2:4]))
hc.single <- hclust(dist(data3[,2:4]), method = 'centroid')
plot(hc.single)

> Using “method” as “centroid” gives better clustering results than “single”. Observing the dendrogram, we can cut it to create 4 clusters

# Cut the centroid-linkage dendrogram into 4 clusters and validate against
# the true labels. (FIX: use TRUE rather than T, which is reassignable.)
data3$h_cluster <- cutree(hc.single, 4)
external_validation(true_labels = data3$Class, clusters = data3$h_cluster, method = 'jaccard_index', summary_stats = TRUE)
##  
## ---------------------------------------- 
## purity                         : 0.9875 
## entropy                        : 0.0358 
## normalized mutual information  : 0.964 
## variation of information       : 0.1441 
## normalized var. of information : 0.0696 
## ---------------------------------------- 
## specificity                    : 0.9917 
## sensitivity                    : 0.976 
## precision                      : 0.9748 
## recall                         : 0.976 
## F-measure                      : 0.9754 
## ---------------------------------------- 
## accuracy OR rand-index         : 0.9878 
## adjusted-rand-index            : 0.9673 
## jaccard-index                  : 0.952 
## fowlkes-mallows-index          : 0.9754 
## mirkin-metric                  : 1950 
## ----------------------------------------
## [1] 0.9519704

Jaccard index = 0.95 and purity = 0.99 for our hierarchical clustering

Plotting data according to actual class, Kmeans clustering and Hierarchical clustering

# 3-D scatter plots of Dataset 3, coloured three ways: true class,
# k-means assignment, and hierarchical-clustering assignment.
axis_titles <- list(xaxis = list(title = "X1"),
                    yaxis = list(title = "X2"),
                    zaxis = list(title = "X3"))
# Actual class
fig <- plot_ly(data3, x = ~X1, y = ~X2, z = ~X3, color = ~Class) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
# K-means clusters
fig <- plot_ly(data3, x = ~X1, y = ~X2, z = ~X3, color = ~kmeans_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
# Hierarchical clusters
fig <- plot_ly(data3, x = ~X1, y = ~X2, z = ~X3, color = ~h_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig

A few points are misclustered in cluster 3 and 2 for hierarchical clustering

Dataset 4

We check K value recommendation from elbow method and silhouette method

# Candidate number of clusters for Dataset 4: elbow and average-silhouette
# methods from factoextra.
set.seed(10)
fviz_nbclust(x = data4[, 2:4], FUNcluster = kmeans, method = "wss")

fviz_nbclust(x = data4[, 2:4], FUNcluster = kmeans, method = "silhouette")

K suggested by elbow and silhouette method = 4; we nevertheless scan a wider range of k values (2 to 13) below

# Setting k = 8 as the centre of the scan.
# NOTE(review): the narrative above reports an elbow/silhouette suggestion
# of 4, not 8 — confirm which recommendation this follows.
k <- 8
#Checking CH value for a wide range of k values (2 to 13) around k
print("K CH Value")
## [1] "K CH Value"
for(i in (k-6):(k+5)){
  km <- kmeans(data4[, 2:4], i, nstart = 20)
  ch <- round(calinhara(data4[, 2:4],km$cluster),digits=2)
  print(paste(i,ch))
}
## [1] "2 730.57"
## [1] "3 588.92"
## [1] "4 615.13"
## [1] "5 664.76"
## [1] "6 711.54"
## [1] "7 799.18"
## [1] "8 918"
## [1] "9 967.82"
## [1] "10 1070.31"
## [1] "11 1153.38"
## [1] "12 1296.86"
## [1] "13 1385.65"

CH value keeps increasing with k, while CH for k = 2 (the actual number of classes) is much lower; we nevertheless evaluate k-means at the true k = 2 below

# Evaluate k-means at k = 2, the actual number of classes, even though CH
# keeps growing with k on this dataset.
# (FIX: use TRUE rather than T, which is reassignable.)
k <- 2
km <- kmeans(data4[, 2:4], k, nstart = 20)
data4$kmeans_cluster <- km$cluster
#Display external_validation stats against the ground-truth Class labels
external_validation(true_labels = data4$Class, clusters = data4$kmeans_cluster, method = 'jaccard_index', summary_stats = TRUE)
##  
## ---------------------------------------- 
## purity                         : 0.653 
## entropy                        : 0.9314 
## normalized mutual information  : 0.0686 
## variation of information       : 1.8627 
## normalized var. of information : 0.9645 
## ---------------------------------------- 
## specificity                    : 0.5468 
## sensitivity                    : 0.5459 
## precision                      : 0.5459 
## recall                         : 0.5459 
## F-measure                      : 0.5459 
## ---------------------------------------- 
## accuracy OR rand-index         : 0.5464 
## adjusted-rand-index            : 0.0927 
## jaccard-index                  : 0.3754 
## fowlkes-mallows-index          : 0.5459 
## mirkin-metric                  : 453182 
## ----------------------------------------
## [1] 0.3754314

For k = 2 (the actual number of classes), Jaccard index = 0.38 and purity = 0.65 — k-means cannot recover the true structure of this dataset, as the visualisations below make clear.

# Single-linkage hierarchical clustering on Euclidean distances.
# NOTE(review): dist_matrix is not used again in this section — confirm
# before removing.
dist_matrix <- as.matrix(dist(data4[,2:4]))
hc.single <- hclust(dist(data4[,2:4]), method = 'single')
plot(hc.single)

> Using “method” as “single” gives better clustering results than “centroid” in this example. Observing the dendrogram, we can cut it to create 2 clusters.

# Cut the single-linkage dendrogram into 2 clusters and validate against
# the true labels. (FIX: use TRUE rather than T, which is reassignable.)
data4$h_cluster <- cutree(hc.single, 2)
external_validation(true_labels = data4$Class, clusters = data4$h_cluster, method = 'jaccard_index', summary_stats = TRUE)
##  
## ---------------------------------------- 
## purity                         : 1 
## entropy                        : 0 
## normalized mutual information  : 1 
## variation of information       : 0 
## normalized var. of information : 0 
## ---------------------------------------- 
## specificity                    : 1 
## sensitivity                    : 1 
## precision                      : 1 
## recall                         : 1 
## F-measure                      : 1 
## ---------------------------------------- 
## accuracy OR rand-index         : 1 
## adjusted-rand-index            : 1 
## jaccard-index                  : 1 
## fowlkes-mallows-index          : 1 
## mirkin-metric                  : 0 
## ----------------------------------------
## [1] 1

Jaccard index = 1 and purity = 1 for our hierarchical clustering, which is perfect clustering and significantly better than K-means. Hierarchical clustering outperforms K-means here.

Plotting data according to actual class, Kmeans clustering and Hierarchical clustering

# 3-D scatter plots of Dataset 4, coloured three ways: true class,
# k-means assignment, and hierarchical-clustering assignment.
axis_titles <- list(xaxis = list(title = "X1"),
                    yaxis = list(title = "X2"),
                    zaxis = list(title = "X3"))
# Actual class
fig <- plot_ly(data4, x = ~X1, y = ~X2, z = ~X3, color = ~Class) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
# K-means clusters
fig <- plot_ly(data4, x = ~X1, y = ~X2, z = ~X3, color = ~kmeans_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
# Hierarchical clusters
fig <- plot_ly(data4, x = ~X1, y = ~X2, z = ~X3, color = ~h_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig

Dataset 5

We check K value recommendation from elbow method and silhouette method

# Candidate number of clusters for Dataset 5: elbow and average-silhouette
# methods from factoextra.
set.seed(10)
fviz_nbclust(x = data5[, 2:4], FUNcluster = kmeans, method = "wss")

fviz_nbclust(x = data5[, 2:4], FUNcluster = kmeans, method = "silhouette")

K suggested by elbow and silhouette method = 10

# Setting k = 10 from the silhouette recommendation
# (the original comment said k = 8, which did not match the code)
k <- 10
#Checking CH value for k values from 2 to 15 around the suggested k
print("K CH Value")
## [1] "K CH Value"
for(i in (k-8):(k+5)){
  km <- kmeans(data5[, 2:4], i, nstart = 20)
  ch <- round(calinhara(data5[, 2:4],km$cluster),digits=2)
  print(paste(i,ch))
}
## [1] "2 273.1"
## [1] "3 294.1"
## [1] "4 286.54"
## [1] "5 308.17"
## [1] "6 339.2"
## [1] "7 375.92"
## [1] "8 379.02"
## [1] "9 382.53"
## [1] "10 398.09"
## [1] "11 395.28"
## [1] "12 409.99"
## [1] "13 383.8"
## [1] "14 368.75"
## [1] "15 370.91"

Like in Dataset 4, the CH value for k = 2 (the actual k) is significantly lower; CH varies little over the scanned range and peaks near k = 12. Keeping k = 10

# Final k-means fit for Dataset 5 with k = 10; store the cluster
# assignments on the data frame.
# (FIX: use TRUE rather than T, which is reassignable.)
k <- 10
km <- kmeans(data5[, 2:4], k, nstart = 20)
data5$kmeans_cluster <- km$cluster
#Display external_validation stats against the ground-truth Class labels
external_validation(true_labels = data5$Class, clusters = data5$kmeans_cluster, method = 'jaccard_index', summary_stats = TRUE)
##  
## ---------------------------------------- 
## purity                         : 1 
## entropy                        : 1.5496 
## normalized mutual information  : 0.5634 
## variation of information       : 1.5496 
## normalized var. of information : 0.6078 
## ---------------------------------------- 
## specificity                    : 1 
## sensitivity                    : 0.5602 
## precision                      : 1 
## recall                         : 0.5602 
## F-measure                      : 0.7181 
## ---------------------------------------- 
## accuracy OR rand-index         : 0.7804 
## adjusted-rand-index            : 0.5605 
## jaccard-index                  : 0.5602 
## fowlkes-mallows-index          : 0.7485 
## mirkin-metric                  : 140380 
## ----------------------------------------
## [1] 0.560213

Jaccard index = 0.56, purity = 1 for our clustering solution, which suggests we have many clusters that have high cohesion, but overall clustering is not that good. For k = 2, we have a lower Jaccard index (0.48) and lower purity (0.71). This is due to the structure of the data: k-means is not able to find a proper clustering for this dataset, which will be clear from the visualisation

# Single-linkage hierarchical clustering on Euclidean distances.
# NOTE(review): dist_matrix is not used again in this section — confirm
# before removing.
dist_matrix <- as.matrix(dist(data5[,2:4]))
hc.single <- hclust(dist(data5[,2:4]), method = 'single')
plot(hc.single)

> Using “method” as “single” gives better clustering results than “centroid” in this example. Observing the dendrogram, we can cut it to create 2 clusters.

# Cut the single-linkage dendrogram into 2 clusters and validate against
# the true labels. (FIX: use TRUE rather than T, which is reassignable.)
data5$h_cluster <- cutree(hc.single, 2)
external_validation(true_labels = data5$Class, clusters = data5$h_cluster, method = 'jaccard_index', summary_stats = TRUE)
##  
## ---------------------------------------- 
## purity                         : 1 
## entropy                        : 0 
## normalized mutual information  : 1 
## variation of information       : 0 
## normalized var. of information : 0 
## ---------------------------------------- 
## specificity                    : 1 
## sensitivity                    : 1 
## precision                      : 1 
## recall                         : 1 
## F-measure                      : 1 
## ---------------------------------------- 
## accuracy OR rand-index         : 1 
## adjusted-rand-index            : 1 
## jaccard-index                  : 1 
## fowlkes-mallows-index          : 1 
## mirkin-metric                  : 0 
## ----------------------------------------
## [1] 1

Jaccard index = 1 and purity = 1 for our hierarchical clustering, which is perfect clustering and significantly better than K-means. Hierarchical clustering outperforms K-means here.

Plotting data according to actual class, Kmeans clustering and Hierarchical clustering

# 3-D scatter plots of Dataset 5, coloured three ways: true class,
# k-means assignment, and hierarchical-clustering assignment.
axis_titles <- list(xaxis = list(title = "X1"),
                    yaxis = list(title = "X2"),
                    zaxis = list(title = "X3"))
# Actual class
fig <- plot_ly(data5, x = ~X1, y = ~X2, z = ~X3, color = ~Class) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
# K-means clusters
fig <- plot_ly(data5, x = ~X1, y = ~X2, z = ~X3, color = ~kmeans_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig
# Hierarchical clusters
fig <- plot_ly(data5, x = ~X1, y = ~X2, z = ~X3, color = ~h_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig

Dataset 6

We check K value recommendation from elbow method and silhouette method

# Candidate number of clusters for Dataset 6 (two feature columns):
# elbow and average-silhouette methods from factoextra.
set.seed(10)
fviz_nbclust(x = data6[, 2:3], FUNcluster = kmeans, method = "wss")

fviz_nbclust(x = data6[, 2:3], FUNcluster = kmeans, method = "silhouette")

K suggested by elbow and silhouette method = 3

#Setting k = 3 from silhouette recommendation
k <- 3
#Checking CH value for k values around the suggested k-value
# (Calinski-Harabasz index via fpc::calinhara; larger is better)
print("K CH Value")
## [1] "K CH Value"
for(i in (k-1):(k+2)){
  km <- kmeans(data6[, 2:3], i, nstart = 20)
  ch <- round(calinhara(data6[, 2:3],km$cluster),digits=2)
  print(paste(i,ch))
}
## [1] "2 3191.71"
## [1] "3 3981.74"
## [1] "4 3706.8"
## [1] "5 3713.51"

CH value for K = 3 is greater than for K = 2, but external validation is better at k = 2 (see below), so we fit k-means with k = 2

# Evaluate k-means at k = 2: CH favours k = 3, but external validation
# (reported below and discussed in the text) is better at k = 2.
# (FIX: use TRUE rather than T, which is reassignable.)
k <- 2
km <- kmeans(data6[, 2:3], k, nstart = 20)
data6$kmeans_cluster <- km$cluster
#Display external_validation stats against the ground-truth Class labels
external_validation(true_labels = data6$Class, clusters = data6$kmeans_cluster, method = 'jaccard_index', summary_stats = TRUE)
##  
## ---------------------------------------- 
## purity                         : 0.9514 
## entropy                        : 0.2691 
## normalized mutual information  : 0.7296 
## variation of information       : 0.5403 
## normalized var. of information : 0.4257 
## ---------------------------------------- 
## specificity                    : 0.9062 
## sensitivity                    : 0.9089 
## precision                      : 0.9064 
## recall                         : 0.9089 
## F-measure                      : 0.9076 
## ---------------------------------------- 
## accuracy OR rand-index         : 0.9075 
## adjusted-rand-index            : 0.8151 
## jaccard-index                  : 0.8309 
## fowlkes-mallows-index          : 0.9076 
## mirkin-metric                  : 1551006 
## ----------------------------------------
## [1] 0.8308878

Jaccard index = 0.59, purity = 0.93 for k = 3, For k = 2, we have a higher jaccard index (0.83) and higher purity (0.95), so we keep k = 2

# Complete-linkage hierarchical clustering on Euclidean distances.
# NOTE(review): dist_matrix is not used again in this section, and the
# variable is named hc.single despite complete linkage — confirm before
# renaming/removing.
dist_matrix <- as.matrix(dist(data6[,2:3]))
hc.single <- hclust(dist(data6[,2:3]), method = 'complete')
plot(hc.single)

> Observing the dendrogram, we can cut it to create 2 clusters.

# Cut the complete-linkage dendrogram into 2 clusters and validate against
# the true labels. (FIX: use TRUE rather than T, which is reassignable.)
data6$h_cluster <- cutree(hc.single, 2)
external_validation(true_labels = data6$Class, clusters = data6$h_cluster, method = 'jaccard_index', summary_stats = TRUE)
##  
## ---------------------------------------- 
## purity                         : 0.6008 
## entropy                        : 0.3747 
## normalized mutual information  : 0.1394 
## variation of information       : 1.2717 
## normalized var. of information : 0.9251 
## ---------------------------------------- 
## specificity                    : 0.2048 
## sensitivity                    : 0.8358 
## precision                      : 0.5123 
## recall                         : 0.8358 
## F-measure                      : 0.6353 
## ---------------------------------------- 
## accuracy OR rand-index         : 0.5202 
## adjusted-rand-index            : 0.0406 
## jaccard-index                  : 0.4655 
## fowlkes-mallows-index          : 0.6544 
## mirkin-metric                  : 8047470 
## ----------------------------------------
## [1] 0.4654753

Jaccard index = 0.47 and purity = 0.60 for our hierarchical clustering,

Plotting data according to actual class, Kmeans clustering and Hierarchical clustering

# 2-D scatter plots of Dataset 6: true class, k-means assignment, and
# hierarchical-clustering assignment.
ggplot(data6, aes(x = X1, y = X2, color = as.factor(Class))) +
  geom_point()

ggplot(data6, aes(x = X1, y = X2, color = as.factor(kmeans_cluster))) +
  geom_point()

ggplot(data6, aes(x = X1, y = X2, color = as.factor(h_cluster))) +
  geom_point()

Dataset 7

We check K value recommendation from elbow method and silhouette method

# Candidate number of clusters for Dataset 7 (two feature columns):
# elbow and average-silhouette methods from factoextra.
set.seed(10)
fviz_nbclust(x = data7[, 2:3], FUNcluster = kmeans, method = "wss")

fviz_nbclust(x = data7[, 2:3], FUNcluster = kmeans, method = "silhouette")

K suggested by elbow and silhouette method = 6

# Setting k = 6 from the silhouette recommendation for Dataset 7
# (original comment said k = 3, which did not match the code).
k <- 6
# Check the Calinski-Harabasz value for k values around the suggested k.
# BUG FIX: the original loop clustered data6 instead of data7; it now uses
# data7's feature columns. The recorded output below came from the buggy
# data6 run and should be regenerated.
print("K CH Value")
## [1] "K CH Value"
for(i in (k-2):(k+2)){
  km <- kmeans(data7[, 2:3], i, nstart = 20, iter.max = 15)
  ch <- round(calinhara(data7[, 2:3],km$cluster),digits=2)
  print(paste(i,ch))
}
## [1] "4 3706.8"
## [1] "5 3713.51"
## [1] "6 3675.35"
## [1] "7 3627.04"
## [1] "8 3541.43"

Note: the CH values above were mistakenly computed on data6 (the loop clustered data6, not data7) and should be regenerated for data7; we proceed with k = 6

# Final k-means fit for Dataset 7 with k = 6; store the cluster
# assignments on the data frame.
# (FIX: use TRUE rather than T, which is reassignable.)
k <- 6
km <- kmeans(data7[, 2:3], k, nstart = 20)
data7$kmeans_cluster <- km$cluster
#Display external_validation stats against the ground-truth Class labels
external_validation(true_labels = data7$Class, clusters = data7$kmeans_cluster, method = 'jaccard_index', summary_stats = TRUE)
##  
## ---------------------------------------- 
## purity                         : 0.9844 
## entropy                        : 0.4208 
## normalized mutual information  : 0.6367 
## variation of information       : 1.1822 
## normalized var. of information : 0.533 
## ---------------------------------------- 
## specificity                    : 0.9943 
## sensitivity                    : 0.6346 
## precision                      : 0.9905 
## recall                         : 0.6346 
## F-measure                      : 0.7735 
## ---------------------------------------- 
## accuracy OR rand-index         : 0.8199 
## adjusted-rand-index            : 0.6355 
## jaccard-index                  : 0.6307 
## fowlkes-mallows-index          : 0.7928 
## mirkin-metric                  : 106658 
## ----------------------------------------
## [1] 0.6307009

Jaccard index = 0.63, purity = 0.98 for k = 6

# Single-linkage hierarchical clustering on Euclidean distances.
# NOTE(review): dist_matrix is not used again in this section — confirm
# before removing.
dist_matrix <- as.matrix(dist(data7[,2:3]))
hc.single <- hclust(dist(data7[,2:3]), method = 'single')
plot(hc.single)

> Observing the dendrogram, we can cut it to create 6 clusters.

# Cut the single-linkage dendrogram into 6 clusters and validate against
# the true labels. (FIX: use TRUE rather than T, which is reassignable.)
data7$h_cluster <- cutree(hc.single, 6)
external_validation(true_labels = data7$Class, clusters = data7$h_cluster, method = 'jaccard_index', summary_stats = TRUE)
##  
## ---------------------------------------- 
## purity                         : 1 
## entropy                        : 0 
## normalized mutual information  : 1 
## variation of information       : 0 
## normalized var. of information : 0 
## ---------------------------------------- 
## specificity                    : 1 
## sensitivity                    : 1 
## precision                      : 1 
## recall                         : 1 
## F-measure                      : 1 
## ---------------------------------------- 
## accuracy OR rand-index         : 1 
## adjusted-rand-index            : 1 
## jaccard-index                  : 1 
## fowlkes-mallows-index          : 1 
## mirkin-metric                  : 0 
## ----------------------------------------
## [1] 1

Jaccard index = 1 and purity = 1 for our hierarchical clustering, which is perfect clustering. From the visualisations we can see that hierarchical clustering performs much better than k-means.

Plotting data according to actual class, Kmeans clustering and Hierarchical clustering

# 2-D scatter plots of Dataset 7: true class, k-means assignment, and
# hierarchical-clustering assignment.
ggplot(data7, aes(x = X1, y = X2, color = as.factor(Class))) +
  geom_point()

ggplot(data7, aes(x = X1, y = X2, color = as.factor(kmeans_cluster))) +
  geom_point()

ggplot(data7, aes(x = X1, y = X2, color = as.factor(h_cluster))) +
  geom_point()

Dataset 8

We check K value recommendation from elbow method and silhouette method

# Candidate number of clusters for Dataset 8: elbow and average-silhouette
# methods from factoextra.
set.seed(10)
fviz_nbclust(x = data8[, 2:4], FUNcluster = kmeans, method = "wss")

fviz_nbclust(x = data8[, 2:4], FUNcluster = kmeans, method = "silhouette")
## Warning: did not converge in 10 iterations

K suggested by elbow and silhouette method = 6

#Setting k = 6 from silhouette recommendation
k <- 6
# Check the Calinski-Harabasz value around the suggested k.
# FIX: start the scan at k = 2 — CH is undefined for a single cluster
# (the original run printed "1 NaN"). iter.max is raised to 15 because the
# default of 10 produced the "did not converge" warnings recorded below.
print("K CH Value")
## [1] "K CH Value"
for(i in max(2, k-5):(k+5)){
  km <- kmeans(data8[, 2:4], i, nstart = 20, iter.max = 15)
  ch <- round(calinhara(data8[, 2:4],km$cluster),digits=2)
  print(paste(i,ch))
}
# Recorded output of the original run (the "1 NaN" line is no longer
# produced after the fix):
## [1] "1 NaN"
## [1] "2 1352.03"
## [1] "3 1485.84"
## [1] "4 1665.55"
## [1] "5 1673.8"
## [1] "6 1811.06"
## [1] "7 1788.68"
## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations
## [1] "8 1823.08"
## [1] "9 1834.67"
## [1] "10 1842.62"
## [1] "11 1846.98"

Keeping k = 6

# Final k-means fit for Dataset 8 with k = 6; iter.max raised to 15 to
# avoid the convergence warnings seen during the CH scan.
# (FIX: use TRUE rather than T, which is reassignable.)
k <- 6
km <- kmeans(data8[, 2:4], k, nstart = 20, iter.max=15)
data8$kmeans_cluster <- km$cluster
#Display external_validation stats against the ground-truth Class labels
external_validation(true_labels = data8$Class, clusters = data8$kmeans_cluster, method = 'jaccard_index', summary_stats = TRUE)
##  
## ---------------------------------------- 
## purity                         : 1 
## entropy                        : Inf 
## normalized mutual information  : 0 
## variation of information       : 2.585 
## normalized var. of information : 1 
## ---------------------------------------- 
## specificity                    : NaN 
## sensitivity                    : 0.1665 
## precision                      : 1 
## recall                         : 0.1665 
## F-measure                      : 0.2854 
## ---------------------------------------- 
## accuracy OR rand-index         : 0.1665 
## adjusted-rand-index            : 0 
## jaccard-index                  : 0.1665 
## fowlkes-mallows-index          : 0.408 
## mirkin-metric                  : 13346658 
## ----------------------------------------
## [1] 0.1664591

Jaccard index = 0.16 and purity = 1 for our k-means solution: each cluster is internally pure, but the overall partition is poor. Because the data has only a single true class, k-means cannot recover a meaningful clustering here, as the visualisations below make clear.

# Compute the pairwise distance object once and reuse it; the original
# called dist() twice, doubling an O(n^2) computation on this dataset.
dist_obj <- dist(data8[, 2:4])
dist_matrix <- as.matrix(dist_obj)
# Single-linkage hierarchical clustering on the same distances.
hc.single <- hclust(dist_obj, method = 'single')
plot(hc.single)

> Observing the dendrogram, we can cut it to create 2 clusters.

# Cut the single-linkage dendrogram into two clusters and validate the
# resulting labels against the true classes.
data8$h_cluster <- cutree(hc.single, k = 2)
external_validation(true_labels = data8$Class,
                    clusters = data8$h_cluster,
                    method = 'jaccard_index',
                    summary_stats = TRUE)
##  
## ---------------------------------------- 
## purity                         : 1 
## entropy                        : Inf 
## normalized mutual information  : 0 
## variation of information       : 0.0034 
## normalized var. of information : 1 
## ---------------------------------------- 
## specificity                    : NaN 
## sensitivity                    : 0.9995 
## precision                      : 1 
## recall                         : 0.9995 
## F-measure                      : 0.9998 
## ---------------------------------------- 
## accuracy OR rand-index         : 0.9995 
## adjusted-rand-index            : 0 
## jaccard-index                  : 0.9995 
## fowlkes-mallows-index          : 0.9998 
## mirkin-metric                  : 8002 
## ----------------------------------------
## [1] 0.9995002

Jaccard index = 0.99 and purity = 1 for our hierarchical clustering, which is significantly better than k-means. Hierarchical clustering clearly outperforms k-means here.

Plotting data according to actual class, Kmeans clustering and Hierarchical clustering

# 3D scatter plots of Dataset 8, coloured by actual class, k-means cluster,
# and hierarchical cluster respectively. The axis-title layout is shared.
axis_titles <- list(xaxis = list(title = 'X1'),
                    yaxis = list(title = 'X2'),
                    zaxis = list(title = 'X3'))

# Coloured by actual class.
fig <- plot_ly(data8, x = ~X1, y = ~X2, z = ~X3, color = ~Class) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig

# Coloured by k-means cluster.
fig <- plot_ly(data8, x = ~X1, y = ~X2, z = ~X3, color = ~kmeans_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig

# Coloured by hierarchical cluster.
fig <- plot_ly(data8, x = ~X1, y = ~X2, z = ~X3, color = ~h_cluster) %>%
  add_markers() %>%
  layout(scene = axis_titles)
fig

Task 2

Data preparation

summary(world_data)
##    Birth.Rate      Business.Tax.Rate  Days.to.Start.Business  Energy.Usage    
##  Min.   :0.00800   Length:208         Min.   :  1.00         Min.   :    765  
##  1st Qu.:0.01200   Class :character   1st Qu.:  9.00         1st Qu.:   5699  
##  Median :0.01900   Mode  :character   Median : 18.00         Median :  18082  
##  Mean   :0.02176                      Mean   : 30.33         Mean   :  92238  
##  3rd Qu.:0.02800                      3rd Qu.: 35.00         3rd Qu.:  51557  
##  Max.   :0.05000                      Max.   :694.00         Max.   :2727728  
##  NA's   :9                            NA's   :27             NA's   :72       
##      GDP            Health.Exp...GDP  Health.Exp.Capita  Hours.to.do.Tax 
##  Length:208         Min.   :0.01700   Length:208         Min.   :  12.0  
##  Class :character   1st Qu.:0.04700   Class :character   1st Qu.: 151.8  
##  Mode  :character   Median :0.06500   Mode  :character   Median : 224.0  
##                     Mean   :0.06736                      Mean   : 279.3  
##                     3rd Qu.:0.08500                      3rd Qu.: 327.0  
##                     Max.   :0.17700                      Max.   :2600.0  
##                     NA's   :23                           NA's   :28      
##  Infant.Mortality.Rate Internet.Usage   Lending.Interest Life.Expectancy.Female
##  Min.   :0.00200       Min.   :0.0000   Min.   :0.0050   Min.   :45.00         
##  1st Qu.:0.00700       1st Qu.:0.1000   1st Qu.:0.0745   1st Qu.:67.00         
##  Median :0.01600       Median :0.4000   Median :0.1060   Median :76.00         
##  Mean   :0.02734       Mean   :0.3769   Mean   :0.1226   Mean   :72.75         
##  3rd Qu.:0.04225       3rd Qu.:0.6000   3rd Qu.:0.1500   3rd Qu.:80.00         
##  Max.   :0.11200       Max.   :0.9000   Max.   :0.5250   Max.   :87.00         
##  NA's   :20            NA's   :9        NA's   :77       NA's   :11            
##  Life.Expectancy.Male Mobile.Phone.Usage Population.0.14  Population.15.64
##  Min.   :45.00        Min.   :0.0000     Min.   :0.1190   Min.   :0.4750  
##  1st Qu.:62.00        1st Qu.:0.7000     1st Qu.:0.1890   1st Qu.:0.5810  
##  Median :70.00        Median :1.0000     Median :0.2790   Median :0.6540  
##  Mean   :68.04        Mean   :0.9587     Mean   :0.2870   Mean   :0.6355  
##  3rd Qu.:75.00        3rd Qu.:1.2000     3rd Qu.:0.3825   3rd Qu.:0.6845  
##  Max.   :81.00        Max.   :2.5000     Max.   :0.4990   Max.   :0.8570  
##  NA's   :11           NA's   :12         NA's   :17       NA's   :17      
##  Population.65.    Population.Urban    Region            Country         
##  Min.   :0.00300   Min.   :0.0890   Length:208         Length:208        
##  1st Qu.:0.03400   1st Qu.:0.3777   Class :character   Class :character  
##  Median :0.05600   Median :0.5715   Mode  :character   Mode  :character  
##  Mean   :0.07742   Mean   :0.5767                                        
##  3rd Qu.:0.11800   3rd Qu.:0.7725                                        
##  Max.   :0.23700   Max.   :1.0000                                        
##  NA's   :17        NA's   :2
# Clean currency and percentage formatting so these columns parse as numeric.
# GDP and Health.Exp.Capita arrive as strings like "$1,234,567"; strip the
# "$" and the thousands separators in a single pass. (The original converted
# through as.factor() first, which is unnecessary and fragile — as.numeric()
# on a factor operates on level codes, not the labels.)
world_data$GDP <- as.numeric(gsub("[$,]", "", world_data$GDP))
world_data$Health.Exp.Capita <- as.numeric(gsub("[$,]", "", world_data$Health.Exp.Capita))
# Remove the trailing "%" from the tax-rate column.
world_data$Business.Tax.Rate <- as.numeric(sub("%", "", world_data$Business.Tax.Rate))

# Scale the numeric indicator columns (1:18) to zero mean / unit variance so
# distance-based clustering is not dominated by large-range variables.
# NOTE(review): this matrix form does not appear to be used below — kept for
# compatibility with any later chunk; confirm and drop if truly unused.
scaled_world_data <- scale(world_data[, 1:18])

# Same scaling kept in data-frame form; across() replaces the superseded
# mutate_all() from older dplyr.
scaled_world_data_2 <- world_data[, 1:18] %>%
  mutate(across(everything(), ~ as.vector(scale(.x))))
scaled_world_data_2$Region <- world_data$Region
scaled_world_data_2$Country <- world_data$Country

# Drop Energy.Usage and Lending.Interest (72 and 77 NAs respectively per the
# summary above), then remove the remaining incomplete rows.
scaled_world_data_2 <- scaled_world_data_2 %>%
  select(-Energy.Usage, -Lending.Interest)
scaled_world_data_NA <- na.omit(scaled_world_data_2)

Check elbow method and silhouette

# Fix the RNG seed so the k suggestions below are reproducible.
set.seed(10)
# Elbow method over the 16 scaled numeric columns.
fviz_nbclust(scaled_world_data_NA[, 1:16], kmeans, method = "wss")

# Average silhouette method over the same columns.
fviz_nbclust(scaled_world_data_NA[, 1:16], kmeans, method = "silhouette")

K suggested by elbow and silhouette method = 2

# Setting k = 2 from silhouette recommendation
k <- 2
# Sweep the Calinski-Harabasz index from k through k + 4 to sanity-check
# the silhouette suggestion.
print("K CH Value")
for (candidate in k:(k + 4)) {
  km <- kmeans(scaled_world_data_NA[, 1:16], candidate, nstart = 20)
  ch <- round(calinhara(scaled_world_data_NA[, 1:16], km$cluster), digits = 2)
  print(paste(candidate, ch))
}
## [1] "2 106.99"
## [1] "3 80.83"
## [1] "4 66.44"
## [1] "5 60.69"
## [1] "6 58.33"

CH value is highest for K = 2, so we select K = 2

Hierarchical clustering

# Compute the pairwise distance object once and reuse it; the original
# called dist() twice, doubling an O(n^2) computation.
dist_obj <- dist(scaled_world_data_NA[, 1:16])
dist_matrix <- as.matrix(dist_obj)
# Single-linkage hierarchical clustering on the same distances.
hc.single <- hclust(dist_obj, method = 'single')
plot(hc.single)

> Observing the dendrogram, we can cut it to create 2 clusters.

scaled_world_data_NA$h_cluster <- cutree(hc.single, 2)